*********************************************************************
**********************  DATA CONSTRUCTION: CHNS ********************* 
*********************************************************************


cd "D:\Replication\"   /* Change this to the replication directory on your local machine; all relative paths below are resolved from here */


// I. Community characteristics
* Builds urban_index.dta from the community urbanisation file: harmonises the
* identifier names and back-fills a missing comm component from index.

import sas using "Master_UrbanIndex_201804\urban_11.sas7bdat", clear

* Harmonise identifier names with the other datasets used in this file
rename (COMMID wave) (commid WAVE)

* Component scores other than comm (comm is the one that gets imputed)
local other_parts denc div econ health house market soc trans edc mart

* Diagnostics: describe the components, then tabulate whether index equals
* the row sum of all components (check = 1 where the two differ)
summarize comm `other_parts'
egen sum_index  = rowtotal(comm `other_parts')
egen sum_index2 = rowtotal(`other_parts')
egen check = diff(index sum_index)
tabulate check

* Fill in missing values: where comm is missing, recover it as index minus
* the sum of the remaining components
replace comm = index - sum_index2 if comm == .

* Remove the temporary helper variables
drop check sum_index*

save urban_index.dta, replace


// II. Community-level income distribution
* Computes inequality measures of household income per capita separately for
* every community-wave cell and saves one row per cell to gini_hh.dta.

* Household income file
import sas using "Master_Constructed_Income_201804\hhinc_10.sas7bdat", clear

* Gross (positive) household income per capita; set to 0 where it is missing
* but net household income per capita is observed
generate hhincpcgross_cpi = hhincgross_cpi / hhsize
replace hhincpcgross_cpi = 0 if hhincpcgross_cpi == . & hhincpc_cpi ~= .

* Consecutive integer id (1, 2, ...) for each community-wave cell
egen group = group(commid WAVE)

* Income distribution indices are generated by ineqdeco (Jenkins, 1999) and ineqdec0 (Jenkins, 1999)
* Stephen P. Jenkins, 1999. "INEQDECO: Stata module to calculate inequality indices with decomposition by subgroup,"
* Statistical Software Components S366002, Boston College Department of Economics, revised 15 Feb 2021.
*
* Stephen P. Jenkins, 1999. "INEQDEC0: Stata module to calculate inequality indices with decomposition by subgroup,"
* Statistical Software Components S366007, Boston College Department of Economics, revised 15 Feb 2021.

* --- Gross income per capita: ineqdeco -------------------------------------
* Placeholders that the loop fills cell by cell
foreach stat in gini theil pctratio_p90p10 pctratio_p75p25 {
    generate `stat' = .
}

* Highest cell id = number of cells to loop over; pinned in a local because
* ineqdeco overwrites r() inside the loop
summarize group
return list
local n_cells = r(max)

forvalues cell = 1/`n_cells' {
    quietly {
        ineqdeco hhincpcgross_cpi if group == `cell'
        replace gini            = r(gini)   if group == `cell'
        replace theil           = r(ge1)    if group == `cell'
        replace pctratio_p90p10 = r(p90p10) if group == `cell'
        replace pctratio_p75p25 = r(p75p25) if group == `cell'
    }
}

* --- Net household income per capita (may be zero or negative): ineqdec0 ---
foreach stat in gini_net pctratio_net_p90p10 pctratio_net_p75p25 {
    generate `stat' = .
}

summarize group
return list
local n_cells = r(max)

forvalues cell = 1/`n_cells' {
    quietly {
        ineqdec0 hhincpc_cpi if group == `cell'
        replace gini_net            = r(gini)   if group == `cell'
        replace pctratio_net_p90p10 = r(p90p10) if group == `cell'
        replace pctratio_net_p75p25 = r(p75p25) if group == `cell'
    }
}

* The indices are constant within a cell, so dropping exact duplicates leaves
* one row per community-wave
keep commid WAVE gini* theil pctratio*
duplicates drop

save gini_hh.dta, replace


* generate income percentiles
* 5th, 10th, ..., 95th percentile of gross household income per capita within
* each community-wave cell, saved to incpct_hh.dta

import sas using "Master_Constructed_Income_201804\hhinc_10.sas7bdat", clear

* Same gross per-capita income definition as used for the inequality indices
generate hhincpcgross_cpi = hhincgross_cpi / hhsize
replace hhincpcgross_cpi = 0 if hhincpcgross_cpi == . & hhincpc_cpi ~= .

generate Y = hhincpcgross_cpi

* Assemble the collapse specification "(p5) p5=Y (p10) p10=Y ... (p95) p95=Y"
local pct_spec
forvalues q = 5(5)95 {
    local pct_spec `pct_spec' (p`q') p`q'=Y
}

collapse `pct_spec', by(commid WAVE)

save incpct_hh.dta, replace

* merge datasets
* Start from the inequality indices and attach, in turn, the income
* percentiles and the urbanisation index; only community-waves found in
* every file are kept
use gini_hh.dta, clear
foreach src in incpct_hh.dta urban_index.dta {
    merge 1:1 commid WAVE using `src', keep(match) nogenerate
}

save incdistribution.dta, replace



// III. Individual income

** individual SES
* age
* Individual-wave records with age plus household and community identifiers;
* merged 1:1 on IDind WAVE into the individual income file further down
import sas using "Master_ID_201908\surveys_pub_12.sas7bdat", clear

* Harmonise identifier names with the other datasets
rename (Idind wave) (IDind WAVE)

keep IDind WAVE hhid commid age

save age.dta, replace

* gender
* One record per individual (merged m:1 on IDind further down)
import sas using "Master_ID_201908\mast_pub_12.sas7bdat", clear
rename Idind IDind

* male = 1 when GENDER equals 1 and 0 otherwise
* NOTE(review): a missing GENDER also yields male = 0 -- confirm this is intended
generate male = (GENDER == 1)

keep IDind male

save gender.dta, replace

* marital status
* Builds marital.dta with a household-head flag and a married flag per
* individual-wave
import sas using "Master_ID_201908\rst_12.sas7bdat", clear

* hhead = 1 when A5 equals 0, and 0 otherwise
* NOTE(review): a missing A5 also yields hhead = 0 -- confirm this is intended
g hhead=A5==0

* A8 codes outside 1-5 are treated as invalid and set to missing
* (A8>5 is also true for already-missing values, which simply stay missing)
replace A8=. if A8>5 | A8<1

* married = 1 when A8 equals 2, 0 for the other valid codes.
* The if-qualifier leaves married missing when A8 is missing; without it the
* comparison A8==2 evaluates to 0 for missing A8, so respondents with unknown
* marital status would be recorded as not married.
g married=A8==2 if !missing(A8)

keep hhead married IDind WAVE

save marital.dta, replace

* education
* Collapses the raw education code A12 into four labelled levels
import sas using "Master_Educ_201804\educ_12.sas7bdat", clear
rename A12 edu

* Code 9 is treated as missing
replace edu = . if edu == 9

* 0-1 -> 1, 2 -> 2, 3-4 -> 3, 5-6 -> 4; any other value is left unchanged
recode edu (0 1 = 1) (2 = 2) (3 4 = 3) (5 6 = 4)

label define edu2 ///
    1 "no or primary education" ///
    2 "lower middle school" ///
    3 "upper middle or vocational school" ///
    4 "high education"
label values edu edu2

keep IDind WAVE edu

save educ.dta, replace

* job
* Renames the raw job items, cleans out-of-range codes and groups occupation
* and employment position into broader labelled categories
import sas using "Master_Income_Categories_201804\jobs_12.sas7bdat", clear
rename wave WAVE

* Descriptive names for the raw questionnaire items
rename (B2 B2B B2C_YR B3B B4 B5 B9A) ///
       (ifwork rehire retireyear changejob occupation position secondjob)

label variable ifwork "if presently working"
label variable rehire "B2B RETIRED BUT REHIRED"

* 0/1 items: any value outside {0, 1} is set to missing
foreach yesno of varlist ifwork rehire secondjob {
    replace `yesno' = . if `yesno' < 0 | `yesno' > 1
}

* Negative codes of changejob are set to missing
replace changejob = . if changejob < 0

* Occupation: raw codes grouped into seven categories (code -9 goes to "others")
recode occupation (1 3 = 1) (2 4 = 2) (5 = 3) (6 = 4) (7 = 5) (11 = 6) ///
                  (-9 8 9 10 12 13 14 15 16 = 7)
label define occu ///
    1 "senior professional/admin manager" ///
    2 "junior professional/office staff" ///
    3 "farmer" ///
    4 "skilled worker" ///
    5 "non-skilled worker" ///
    6 "service worker" ///
    7 "others"
label values occupation occu

* Employment position: raw codes grouped into four categories
recode position (1 2 = 1) (3 = 2) (4 = 3) (5 6 7 8 9 = 4)
label define pos1 ///
    1 "self-employed" ///
    2 "permanant" ///
    3 "contract" ///
    4 "others"
label values position pos1

keep IDind WAVE ifwork rehire retireyear changejob occupation position secondjob

save job.dta, replace


* individual income
* Builds indinc_analysis.dta: non-negative income totals by source, deflated
* by CPI2015, merged with the individual SES files created above, plus a
* retirement indicator
import sas using "Master_Constructed_Income_201804\indinc_10.sas7bdat", clear
duplicates drop

rename wave WAVE
sum indbus indfarm indfish indgard indlvst indret indwage

* Gross versions of the components that can be negative: negative values are
* floored at zero; missing values stay missing (a missing value is never < 0)
foreach x of varlist indbus indfarm indfish indgard indlvst {
	g `x'_gross=`x'
	replace `x'_gross =0 if `x'<0
}

* Total non-negative income over all seven components. The missing option
* leaves total missing only when every summed component is missing, so a
* person whose only reported income is business income keeps a non-missing
* total.
egen total=rowtotal(indbus_gross indfarm_gross indfish_gross indgard_gross indlvst_gross indret indwage), missing
label var total "total positive income"

* Agricultural income; missing only when all four components are missing
egen agriculture=rowtotal(indfarm_gross indfish_gross indgard_gross indlvst_gross), missing
label var agriculture "total non-negative income from farming fishing gardening livestock"

* Deflate by CPI2015
g total_cpi=total/CPI2015
g agriculture_cpi=agriculture/CPI2015
g business_cpi=indbus_gross/CPI2015
g wage_cpi=indwage/CPI2015


* merge with individual socioeconomic variables
* (only records matched in every file are kept)
merge m:1 IDind using gender.dta, keep(match) nogen
merge 1:1 IDind WAVE using age.dta, keep(match) nogen
merge 1:1 IDind WAVE using marital.dta, keep(match) nogen
merge 1:1 IDind WAVE using educ.dta, keep(match) nogen
merge 1:1 IDind WAVE using job.dta, keep(match) nogen

format IDind %15.0f

* Negative values and 999 in retireyear are treated as missing
replace retireyear=. if retireyear<0
replace retireyear=. if retireyear==999

* Earliest and latest retirement year reported by each individual
bys IDind: egen retireyr_min=min( retireyear )
bys IDind: egen retireyr_max=max( retireyear )

* NOTE(review): egen min() already assigns the group minimum to every row of
* the individual, so this xfill (user-written command) looks redundant --
* confirm before removing
xfill retireyr_min, i(IDind)

* Use the earliest reported retirement year in every wave of the individual
replace retireyear=retireyr_min

* retired = 1 in waves after the retirement year; 0 otherwise, including when
* no retirement year is known (missing compares as larger than any number)
g retired=WAVE>retireyear

save indinc_analysis.dta, replace


* source of household income
* Groups household income components into four sources and expresses each
* per capita and deflated by CPI2015; saved to hhinc_source.dta

import sas using "Master_Constructed_Income_201804\hhinc_10.sas7bdat", clear

summarize HHFARM HHFISH hhgard HHLVST HHSUB HHOTHR hhNRwage HHRETIRE HHBUS

* Floor negative agricultural and business income at zero
* (missing values are untouched)
foreach comp of varlist HHFARM HHFISH hhgard HHLVST HHBUS {
    replace `comp' = 0 if `comp' < 0
}

* Source totals; rowtotal treats missing components as 0, so a household with
* every component missing gets 0 rather than missing
egen agri     = rowtotal(HHFARM HHFISH hhgard HHLVST)
egen other    = rowtotal(HHSUB HHOTHR)
egen wage     = rowtotal(hhNRwage HHRETIRE)
egen business = rowtotal(HHBUS)

* Per capita, deflated by CPI2015
foreach src of varlist agri other wage business {
    generate `src'_pc_cpi = (`src' / hhsize) / CPI2015
}

* Sum of the four per-capita sources
egen total_pc_cpi = rowtotal(agri_pc_cpi other_pc_cpi wage_pc_cpi business_pc_cpi)

save hhinc_source.dta, replace

